*************************************************************
*************************************************************
*                                                           *
*        Part 1: Merge Raw Data Files                       *
*                                                           *
*************************************************************
*************************************************************

cd ".\replication_folder"

******* 1. Coal Production **********************************

*** 1.1 Merge annual files from EIA

xmluse  ".\replication_folder\coalpublic1984.xml", sheet("Hist_Coal_Prod") clear doctype(excel) firstrow cells(A4:P10000) allstring 
save production, replace

qui forvalues i = 1985/2012 {
xmluse  ".\replication_folder\coalpublic`i'.xml", sheet("Hist_Coal_Prod") clear doctype(excel) firstrow cells(A4:P10000) allstring 
append using production
save production, replace
}
rename var14 Production
save production, replace

qui forvalues i = 2013/2020 {

import excel ".\replication_folder\coalpublic`i'.xls", sheet("Hist_Coal_Prod") cellrange(A4) firstrow allstring clear
rename (MSHAID	MineName	MineState	MineCounty	MineStatus	MineType	CompanyType	OperationType	OperatingCompany	OperatingCompanyAddress	UnionCode	CoalSupplyRegion	Productionshorttons	AverageEmployees	LaborHours)  ///
       (MSHA_ID	Mine_Name	Mine_State	Mine_County	Mine_Status	Mine_Type	Company_Type	Operation_Type	Operating_Company	Operating_Company_Address	Union_Code	Coal_Supply_Region	Production	Average_Employees	Labor_Hours)
append using production
save production, replace

}

destring Year Production-Labor_Hours, replace force
compress

save coal_raw, replace

*** 1.2 Collpase to mine (msha_id) level

use coal_raw, clear

renvarlab  *, lower
drop if year<2001
drop if year==.
 
replace mine_state = "" if mine_state=="Refuse Recovery"          // 431 observations have mine_state=="Refuse Recovery".
bys msha_id (year): carryforward mine_state, replace              // (152 real changes made)
gsort msha_id -year
bys msha_id: carryforward mine_state, replace                     // (44 real changes made)

egen x = group(msha_id year)
qui unique mine_status, by(x) gen(nn)
tab nn                                                             // Only one mine has two different status in the same year.year	msha_id, (2011	4606736), which are duplicated observations with entry errors.
drop if mine_status != "Active" & msha_id=="4606736" & year==2011  // Drop the wrong obervation

gsort msha_id year -average_employees

collapse (firstnm) mine_name-mine_status company_type-coal_supply_region mine_basin (sum) production-labor_hours, by(msha_id year)

save coal_msha, replace

*** 1.3 Add county id

use coal_msha, clear

replace mine_county="Claiborne" if mine_county=="Clairborne" & mine_state=="Tennessee"
replace mine_county="Athens" if mine_county=="Athans" & mine_state=="Ohio"
replace mine_county="yukon koyukuk" if mine_county=="Yukon-Koyukuk" & mine_state=="Alaska"
replace mine_state="West Virginia" if mine_county=="Monongalia" & mine_state=="Pennsylvania (Bituminous)" 

split mine_state , parse(" (" ")")  

g sname = strltrim(mine_state1) 
replace sname = strrtrim(sname) 

g cname = strltrim(mine_county) 
replace cname = strrtrim(cname) 

merge m:1 sname cname using countyfipstool20190120   // 236 from coal_msha do not merge 
drop if _merge==2

split operating_company_address if _merge==1, parse(", ")  
replace sab = substr(operating_company_address3,1,2) if _merge==1 & operating_company_address4==""  // (225 real changes made)
replace sab = substr(operating_company_address4,1,2) if _merge==1 & operating_company_address4!=""  // (9 real changes made)
bys operating_company: carryforward sab  if _merge==1, replace   // 2 observations have incomplete addresses. So use the sab of the same company; (2 real changes made)

drop mine_state1 mine_state2 _merge operating_company_address1-operating_company_address4

merge m:1 sab cname using countyfipstool20190120 , update  // (missing updated                   211)  25 are not matched due to company address changes or operating company and mine are not in the same state 
drop if _merge==2
replace sab="" if _merge==1                                // (25 real changes made)
sort msha_id year

bys msha_id: carryforward sname sab-fips  if _merge==1, replace //  (9 real changes made)
gsort msha_id -year
bys msha_id: carryforward sname sab-fips  if _merge==1, replace  //  (1 real change made) 15 mines' location cannot be identified, as the county of mine is inconsistent with the state of operating company and the state of mine is missing.
drop _merge

sort msha_id year

save coal_fips, replace

*** 1.4 Collapse data to the county level

use coal_fips, clear

egen group = group(fips year)
qui unique msha_id , by(group) gen(n_all)
qui unique msha_id if mine_status=="Permanently abandoned", by(group) gen(n_close)
qui unique msha_id if mine_status!="Permanently abandoned", by(group) gen(n_active)

gen name = subinstr(operating_company,"Coal","COAL",.)
split name, parse(, " Inc" " Llc" " Corp" " Inc" " Co." " Company" " Co" " L L C")
g company_name = strtrim(name1)
unique company_name , by(group) gen(n_company) 

collapse (firstnm) sname-cfips n_* (sum) production-labor_hours, by(fips year)
drop if fips==.           // (10 observations deleted)

xtset fips year

save coal_county, replace


****** 2. County-level Annual Economic variables **********  

*** 2.1 Unemployment rate from BLS for PSM

use la_county1, clear

g year = year(dofm(ym))
destring countycode, gen(fips)

collapse (last) un_rate-labor_force, by(fips year)

save unrate_county, replace

*** 2.2 Personal income from BEA for PSM

import delimited ".\replication_folder\CAINC1__ALL_AREAS_1969_2020.csv", clear numericcols(9(1)60)

drop if linecode==.
keep if linecode==3
g fips = substr(geofips,3,5) 
destring fips, replace

forvalues i = 9/60 {
	local j = `i' - 9 + 1969
	rename v`i' pcpi`j'
}

keep fips pcpi*
reshape long pcpi, i(fips) j(year)

save pcpi_county, replace

*** 2.3 Census population by age, sex, and race (for PSM and as an outcome measure)

import delimited ".\replication_folder\cc-est2010-alldata.csv", clear 
drop if inlist(year,1,2,13,14)   // There seems to be an error row with year=13 that does not correspond to any year; 14 is 2010 which is covered in the next file
replace year = year - 12 + 2009
tempfile temp
save `temp'

import delimited ".\replication_folder\CC-EST2020-ALLDATA.csv", clear numericcols(8(1)80)
drop if inlist(year,1,2)
replace year = year - 3 + 2010
append using `temp'

sort state county year agegrp
gen str2 State = string(state,"%02.0f")
gen str3 County = string(county,"%03.0f")
gen fips = State+County
destring fips, replace

g age_grp = 0 if agegrp==0
replace age_grp = 1 if agegrp>=1 & agegrp<=4   // 0-19 
replace age_grp = 2 if agegrp>=5 & agegrp<=13  // 20-64
replace age_grp = 3 if agegrp>=14              // 65-

collapse (sum) tot_pop-ba_female h_male h_female, by(fips year age_grp)

egen id = group(fips year)

reshape wide tot_pop-h_female, i(id) j(age_grp)
drop id

save age_race_county, replace


****** 3 County-level five-year ACS data ****************************
* B19081_001 and B19081_005 are the mean household income of the lowest and highest quintiles
*
* Educational attainment for population 25 years old and over using 5 years of data. The percent of the population who has completed an Associate's degree or higher is calculated by FRED by adding the following variables from the 5-year American Community Survey (ACS) percent of the population with an Associate's Degree, 
* percent of the population with a Bachelor's degree, and percent of the population with a Graduate or Professional degree. (ACS variables S1501_C02_011E, S1501_C02_012E, S1501_C02_013E from table S1501.)

global censuskey "9e44c71c7c5c849df11a6d81eda75af1b3d552ba"

getcensus catalog, product(DP) clear year(2011)
getcensus catalog, table(DP03) clear  year(2011)                   

tempfile temp
getcensus B19081_001 B19081_005   ///
B08013_001 B08012_001  ,   geography(county) year(2011) sample(5) noerror  clear                                                                   
save `temp'                                                                                    

getcensus DP04_0112 DP04_0113 DP04_0121 DP04_0122 DP04_0139 DP04_0140 DP04_0108 DP04_0115  DP04_0134                ///
DP05_0001 DP05_0003 DP05_0017 DP05_0032 DP05_0033 DP05_0065 DP03_0062 DP04_0088, geography(county) year(2011) sample(5) noerror  clear
merge 1:1 year state county using `temp'
drop _merge
save `temp', replace

getcensus S1501_C01_011 S1501_C01_012 S1501_C01_013  S1101_C01_005 S1101_C03_005 S1101_C04_005  , geography(county) year(2011) sample(5) noerror  clear                                                                                                             
merge 1:1 year state county using `temp'
drop if _merge==1

renvarlab s1501_c01_011e-b08012_001e, postdrop(1) 
renvarlab s1501_c01_011-b08012_001, upper

g inequality = B19081_005/B19081_001
g single_parent = 100*(S1101_C04_005 + S1101_C03_005)/S1101_C01_005 
g education = S1501_C01_011 + S1501_C01_012 + S1501_C01_013 

rename (DP05_0001 DP05_0003 DP05_0017 DP05_0032 DP05_0033 DP05_0065  DP03_0062 DP04_0088)          (pop_tot  female  median_age white black hispanic  median_inc median_home_value)

gen fips = state+county
destring fips, replace

keep year fips name pop_tot inequality-education female  median_age white black hispanic  median_inc median_home_value
compress

save acs_county, replace


****** 4 Mobility from Chetty et al. (2014) *************************************

use mobility, clear

g mobility_abs = e_rank_b
g mobility_rel = s_rank
rename county_id fips
keep fips mobility_abs mobility_rel

save mobility_county


****** 5 FDIC Deposits and branches **********************************

use sod, clear

renvarlab  *, lower
g fips = stcntybr

bys fips year : egen tot_cntydep = total(depsumbr) // tot $ deposits
bys fips year : egen tot_cntybra = count(depsumbr) // tot n of branches

bys fips year : egen tot_cntydep_large = total(depsumbr) if asset>1000000 
bys fips year : egen tot_cntybra_large = count(depsumbr) if asset>1000000 

collapse (firstnm) tot_*, by( fips year  )

g large_dep = 100*tot_cntydep_large/tot_cntydep
g large_bra = 100*tot_cntybra_large/tot_cntybra

save sod_county, replace


******* 6  HMDA ********************************************

*** 6.1 Merge annual HMDA data files

use hmda_2007, clear
save hmda1

qui forvalues i=2008/2017 {
use hmda_`i', clear
append using hmda1
save hmda1, replace
}

*** 6.2 Construct variables

use hmda1, clear

rename *, lower
g fips = state + county
destring fips , replace 

keep if loan_type=="1"     // Conventional loans
keep if property_type=="1" // 1-4 families
keep if lien_status=="1"   // First lien

g white = 1     if (aprace1 =="5" & coaprace1 == "8") | (aprace1 =="5" & coaprace1 == "5")
g black = 1     if (aprace1 =="3" & coaprace1 == "8") | (aprace1 =="3" & coaprace1 == "3")

g loan_amt = loanamt_recoded
drop income rate_spread
rename (hmda_year respondent_id sequence_number agency_code_recoded income_recoded rate_spread_recoded)   (year bank_id  loan_id agency_id income rate_spread )

winsor2 loan_amt rate_spread, cuts(1 99) replace

g application_count = 1 if inlist(action_type,"1","2","3","7","8")
g application_amt = loan_amt if inlist(action_type,"1","2","3","7","8")

replace loan_purpose = "3" if inlist(loan_purpose,"31","32" )
replace loan_purpose = "2" if inlist(loan_purpose,"4","5" )

foreach v in application_count application_amt  {
	g `v'_1 = `v' if  loan_purpose == "1"              // Purchase
	g `v'_2 = `v' if  loan_purpose == "2"              // Improvement
	g `v'_3 = `v' if  loan_purpose == "3"              // Refinancing
}

collapse (sum) application_count* application_amt*    , by(fips year)

save hmda_county


****** 7 Wage and employment from QCEW ***************************************

do "C:\ado\personal\loadqcew.ado.txt"
loadqcew naics, year0(2000) year1(2020) agglvl(70 71 73) frequency(annual)   // 75 is three-digit NAICS
save qcew_ssector

use qcew_ssector, clear

g s = own_code if industry_code=="10"
replace s = 6 if inlist(industry_code,"1011") & own_code==5
destring area_fips, gen(fips)

collapse (firstnm) annual_avg_emplvl total_annual_wages annual_avg_estabs avg_annual_pay if !mi(s) , by(fips s year)
rename (annual_avg_emplvl total_annual_wages annual_avg_estabs avg_annual_pay) (emp total_wage estabs avg_pay  )
reshape wide emp total_wage estabs avg_pay, i(fips year) j(s)

replace estabs6=-1*estabs6
replace emp6 = -1*emp6
replace total_wage6 = -1*total_wage6
egen estabs_non_coal = rowtotal( estabs5  estabs6)
egen emp_non_coal = rowtotal(emp5  emp6)
egen total_wage_non_coal = rowtotal(total_wage5 total_wage6 )
g avg_pay_non_coal = total_wage_non_coal/emp_non_coal
rename (emp1  estabs1 avg_pay1  ) (emp_fed estabs_fed avg_pay_fed  )        // federal
rename (emp2  estabs2 avg_pay2  ) (emp_st  estabs_st avg_pay_st  )            // state
rename (emp3  estabs3 avg_pay3  ) (emp_loc estabs_loc avg_pay_loc  )         // local
rename (emp5  estabs5 avg_pay5  ) (emp_all estabs_all avg_pay_all  )         // private

keep fips year *_fed *_st *_loc *_all *_non_coal

save qcew_county, replace


****** 8 CFPB rural or underserved county list *****************

import excel ".\replication_folder\201312_cfpb_list-to-use-in-2012.xls", sheet("ListToUseIn2012w2011data") firstrow case(lower) clear

g cfpb = 1
g year=2012
drop state county

save cfpb_county, replace


****** 9 County lat lng *****************************************

import excel ".\replication_folder\uscounties.xlsx", sheet("Sheet1") firstrow clear

rename county_fips fips 
keep fips lat lng

save lat_lng_county


****** 10 ZCTA *************************************************

use zcta, clear

destring geoid, gen(fips)
keep fips copop cohu coarea coarealand
duplicates drop

save zcta_county


****** 11 County distances *************************************

use sf12010countydistancemiles, clear

destring county1, gen(fips)

save distance


****** 12 Merge county-level data ****************************** 

use age_race_county, clear

merge m:1 fips using zcta_county
drop if _merge==2
drop _merge

merge m:1 fips using mobility_county
drop if _merge==2
drop _merge

merge m:1 fips using lat_lng_county
drop _merge

merge 1:1 fips year using cfpb_county
drop if _merge==2
drop _merge

merge 1:1 fips year using unrate_county
drop if _merge==2                        // Delete prior to 2000 and PR
drop _merge

merge 1:1 fips year using pcpi_county
drop if _merge==2                        // Delete prior to 2000
drop _merge

merge 1:1 fips year using sod_county
drop if _merge==2                        // Delete prior to 2000 and PR
drop _merge

merge 1:1 fips year using hmda_county
drop if _merge==2                       
drop _merge

merge 1:1 fips year using acs_county     
drop if _merge==2                        // Delete PR
drop _merge name

merge 1:1 fips year using qcew_county     
drop if _merge==2                        
drop _merge 

merge 1:1 fips year using coal_county  
drop _merge sname-cfips

save merged, replace
